This notebook processes multiple CSV files containing time-series amplitude measurements and combines them into a single dataset. The code loops through all the CSVs in the same folder that contain 'time_' in the file names (that look like below left in Excel) and combines them into a file called raw_data.csv (that looks like below right in Excel). It also generates and saves to first/last reading images (one truncated and one untruncated).
In [1]:
generate_plot = True
In [2]:
# imports
import pandas as pd
import os
import shutil
import glob
from re import findall, compile
import datetime
import ctypes
In [3]:
# define working directory
try:
workingDirectory = os.path.dirname(__file__)
except:
workingDirectory = str(os.getcwd())
In [4]:
# create list all the CSVs in the folder containing 'time_' in file name
os.chdir(workingDirectory)
list_of_csvs = glob.glob('*.{}'.format("csv"))
regex = compile(r'time_'); list_of_csvs = [i for i in list_of_csvs if regex.search(i)]
print("The following CSVs will be looped through...")
print(list_of_csvs)
The following CSVs will be looped through... ['time_0xZn_1_Ch1_240401_094500.csv', 'time_0xZn_1_Ch1_240402_161000.csv', 'time_0xZn_1_Ch1_240403_091000.csv', 'time_0xZn_1_Ch1_240404_093000.csv', 'time_0xZn_1_Ch1_240405_093000.csv', 'time_0xZn_1_Ch1_240408_094500.csv', 'time_0xZn_1_Ch1_240517_131500.csv']
In [5]:
# define list to store width (number of columns) for each dataframe (these should always be equal)
df_widths = []
In [6]:
# create distances dataframe (same for every file in folder)
df_distances = pd.read_csv(list_of_csvs[0], header=None, names=["Distance","Amplitude"]).T
df_distances = df_distances[0:1]
df_distances = df_distances.reset_index()
df_distances_width = df_distances.shape[1]
df_widths.append(df_distances_width)
In [7]:
# loop through the files, transpose, and add single rows to the df_all_amplitures dataframe
df_all_amplitudes = pd.DataFrame()
num_csvs_looped_though = 0
for current_csv_file in list_of_csvs:
print("Processing: " + current_csv_file + " (" + str(num_csvs_looped_though+1) + " of " + str(len(list_of_csvs)) + ")")
# get timestemp
try:
datetime_from_file_name = findall(r'_([0-9]{6}_[0-9]{5,})\.csv', current_csv_file)[0]
except:
ctypes.windll.user32.MessageBoxW(0, "The are CSV files (excluding raw_data.csv) that do not match the regular expression below:\n\n_([0-9]{6}_[0-9]{5,})\.csv", "Error", 1)
year = datetime_from_file_name[0:2]
month = datetime_from_file_name[2:4]
day = datetime_from_file_name[4:6]
time = findall(r'_([0-9]{5,})\.csv', current_csv_file)[0]
hour = time[0:2]
minute = time[2:4]
full_timestamp = "20" + year + "-" + month + "-" + day + "T" + hour + ":" + minute + ":00+00:00"
# read and transpose CSV
df_amplitude = pd.read_csv(current_csv_file, header=None, names=["Distance","Amplitude"]).T
df_amplitude = df_amplitude[1:2]
df_amplitude = df_amplitude.reset_index()
df_amplitude_width = df_amplitude.shape[1]
df_widths.append(df_distances_width)
# replace first (leftmost) value with timestamp
df_amplitude['index'] = df_amplitude['index'].replace(['Amplitude'],full_timestamp)
df_all_amplitudes = pd.concat([df_all_amplitudes, df_amplitude], ignore_index=True)
# add to counter
num_csvs_looped_though = num_csvs_looped_though + 1
Processing: time_0xZn_1_Ch1_240401_094500.csv (1 of 7) Processing: time_0xZn_1_Ch1_240402_161000.csv (2 of 7) Processing: time_0xZn_1_Ch1_240403_091000.csv (3 of 7) Processing: time_0xZn_1_Ch1_240404_093000.csv (4 of 7) Processing: time_0xZn_1_Ch1_240405_093000.csv (5 of 7) Processing: time_0xZn_1_Ch1_240408_094500.csv (6 of 7) Processing: time_0xZn_1_Ch1_240517_131500.csv (7 of 7)
In [8]:
# sort dataframe (so earliest readings are at top)
df_all_amplitudes = df_all_amplitudes.sort_values(['index'], ascending = [True])
In [9]:
# union distances (top row only) and all amplitudes dataframes
df_final = pd.concat([df_distances, df_all_amplitudes], ignore_index=True)
df_final['index'] = df_final['index'].replace(['Distance'],'')
df_final.head()
Out[9]:
| index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 1190 | 1191 | 1192 | 1193 | 1194 | 1195 | 1196 | 1197 | 1198 | 1199 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.000000 | 0.107207 | 0.214414 | 0.321622 | 0.428829 | 0.536036 | 0.643243 | 0.750450 | 0.857658 | ... | 127.576577 | 127.683784 | 127.790991 | 127.898198 | 128.005405 | 128.112613 | 128.219820 | 128.327027 | 128.434234 | 128.541441 | |
| 1 | 2024-04-01T09:45:00+00:00 | 0.069553 | 0.036079 | 0.016572 | 0.010159 | 0.018783 | 0.014729 | 0.012764 | 0.014193 | 0.021974 | ... | 0.001055 | 0.002097 | 0.001326 | 0.001961 | 0.006971 | 0.011156 | 0.009833 | 0.008984 | 0.014437 | 0.031000 |
| 2 | 2024-04-02T16:10:00+00:00 | 0.063265 | 0.036869 | 0.016237 | 0.010420 | 0.018407 | 0.014863 | 0.012432 | 0.013837 | 0.021768 | ... | 0.000661 | 0.001665 | 0.000946 | 0.002118 | 0.006235 | 0.009632 | 0.009815 | 0.008587 | 0.015554 | 0.030573 |
| 3 | 2024-04-03T09:10:00+00:00 | 0.048215 | 0.032925 | 0.016320 | 0.010914 | 0.016880 | 0.014081 | 0.012564 | 0.013948 | 0.022045 | ... | 0.000173 | 0.001283 | 0.000691 | 0.001887 | 0.005185 | 0.006122 | 0.009726 | 0.010025 | 0.015649 | 0.032935 |
| 4 | 2024-04-04T09:30:00+00:00 | 0.070546 | 0.037625 | 0.017032 | 0.009807 | 0.019065 | 0.014699 | 0.012599 | 0.013919 | 0.021953 | ... | 0.000979 | 0.001985 | 0.001038 | 0.001870 | 0.006644 | 0.011179 | 0.009658 | 0.008618 | 0.014939 | 0.030755 |
5 rows × 1201 columns
In [10]:
# check dataframe widths are equal
df_widths = list(set(df_widths))
if len(df_widths) == 1:
print("Dataframe widths are equal, no error")
else:
ctypes.windll.user32.MessageBoxW(0, "Dataframe widths are unequal, all CSV files should have the same number of rows.", "Error", 1)
Dataframe widths are equal, no error
In [ ]:
# output CSV
output_file_name = "raw_data.csv"
try:
os.remove(output_file_name)
except:
pass
df_final.to_csv(output_file_name, index=False, header=False)
In [12]:
# generate and save plots
if generate_plot == True:
import matplotlib.pyplot as plt
def make_plot(truncate_dist):
df_final_t = df_final.T.reset_index()
df_final_t.columns = df_final_t.iloc[0]
df_final_t = df_final_t[1:]
df_final_t = df_final_t.rename(columns={df_final_t.columns[1]: 'Distance'})
df_final_t = df_final_t.drop(['index'], axis=1)
df_final_t = df_final_t.iloc[:, [0, 1, -1]] # get first, second, and last columns
if truncate_dist == True:
max_index_first = pd.to_numeric(df_final_t.iloc[:, 1], errors='coerce')
max_index_first = max_index_first.idxmax()
max_index_last = pd.to_numeric(df_final_t.iloc[:, 2], errors='coerce')
max_index_last = max_index_last.idxmax()
maxs_added = max_index_first + max_index_last
df_final_t = df_final_t.iloc[:maxs_added, :]
formatted_time_first = df_final_t.columns[1]
formatted_time_first= formatted_time_first.replace('T',' ')
formatted_time_first = formatted_time_first.replace(':00+00:00','')
formatted_time_last = df_final_t.columns[2]
formatted_time_last= formatted_time_last.replace('T',' ')
formatted_time_last = formatted_time_last.replace(':00+00:00','')
plt.figure(figsize=(14, 8))
# Plot first recent reading
plt.scatter(df_final_t['Distance'], df_final_t.iloc[:, 1], label='First reading')
plt.plot(df_final_t['Distance'], df_final_t.iloc[:, 1], linestyle='-', linewidth=0.5, color='blue', alpha=0.5)
# Plot most recent reading
plt.scatter(df_final_t['Distance'], df_final_t.iloc[:, 2], label='Most recent reading')
plt.plot(df_final_t['Distance'], df_final_t.iloc[:, 2], linestyle='-', linewidth=0.5, color='red', alpha=0.5)
plt.xlabel(df_final_t.columns[0]) # Label for x-axis (first column name)
plt.ylabel("Amplitude") # Label for y-axis (second column name)
plt.ylim(bottom=0)
chart_title = "Amplitudes at " + formatted_time_first + " (first reading) and " + formatted_time_last + " (most recent reading)"
if truncate_dist == True:
chart_title = chart_title + " [DISTANCE TRUNCATED]"
plt.title(chart_title)
plt.legend()
image_title = "Most recent and first readings"
if truncate_dist == True:
image_title = image_title + " (distance truncated)"
image_title = image_title + ".png"
try:
os.remove(image_title)
except:
pass
plt.savefig(image_title, format='png', dpi=300)
#plt.show()
make_plot(truncate_dist=False)
try:
make_plot(truncate_dist=True)
except:
pass